In [1]:
from bertopic import BERTopic

# Restore the previously saved BERTopic model from disk; every later cell
# queries or visualises this `topic_model` object.
# NOTE(review): the path is absolute and machine-specific.
topic_model = BERTopic.load("/home/zhhuang/climate_policy_paper/code/model_save/bert_topic_country_expand_model")
/home/zhhuang/anaconda3/envs/climatepolicy/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
In [2]:
import pandas as pd

# Read the corpus workbook, then pull the two columns used by the rest of the
# notebook into plain Python lists: the document texts and their year labels.
df = pd.read_excel("/home/zhhuang/climate_policy_paper/code/data/Topic_docs_time_country_expand.xlsx")
docs = df["docs"].to_list()
timestamp = df["Year"].to_list()
In [3]:
# Show, for every document, the topic it was assigned to together with the
# topic's name, top words, assignment probability and representative flag.
# (The former leading `get_topic_info()['Topic']` expression was removed: it
# was not the last expression of the cell, so its result was computed and then
# silently discarded.)
topic_model.get_document_info(docs)[
    ["Topic", "Name", "Top_n_words", "Probability", "Representative_document"]
]
Out[3]:
Topic Name Top_n_words Probability Representative_document
0 -1 -1_energy_development_management_project energy - development - management - project - ... 0.788525 False
1 -1 -1_energy_development_management_project energy - development - management - project - ... 0.342139 False
2 -1 -1_energy_development_management_project energy - development - management - project - ... 0.503221 False
3 -1 -1_energy_development_management_project energy - development - management - project - ... 0.100914 False
4 -1 -1_energy_development_management_project energy - development - management - project - ... 0.649000 False
... ... ... ... ... ...
68334 -1 -1_energy_development_management_project energy - development - management - project - ... 0.556607 False
68335 -1 -1_energy_development_management_project energy - development - management - project - ... 0.827829 False
68336 -1 -1_energy_development_management_project energy - development - management - project - ... 0.790249 False
68337 -1 -1_energy_development_management_project energy - development - management - project - ... 0.518765 False
68338 -1 -1_energy_development_management_project energy - development - management - project - ... 0.557813 False

68339 rows × 5 columns

In [4]:
# Count how often each whitespace-separated token occurs across all documents
# and print the 100 most frequent tokens as an aligned two-column table.
from collections import Counter

counts = Counter()
for doc in docs:
    counts.update(doc.split())

# most_common() sorts by descending count and keeps first-seen order for ties,
# which matches the previous stable sort with reverse=True.
items = counts.most_common()

# Slicing (instead of indexing 0..99) tolerates corpora with fewer than 100
# distinct tokens, which previously raised IndexError.
top_items = items[:100]

# Size the word column to the longest printed token, never narrower than the
# original 10 characters, so long words no longer run into their counts
# (e.g. "development23921").
word_width = max([10] + [len(word) + 1 for word, _ in top_items])
for word, count in top_items:
    print("{0:<{width}}{1:>5}".format(word, count, width=word_width))
energy    36106
land      26216
development23921
plan      16674
forest    16482
national  16031
establish 15315
environmental15090
management14829
agricultural14231
activity  13924
public    13387
purpose   12978
article   12923
policy    12503
protection12038
resource  11674
production11124
measure   11046
project   10876
system    10747
emission  10632
sector    10620
provide   10296
procedure  9784
power      9776
objective  9773
environment 9730
product    9677
promote    9493
set        9430
water      9235
control    9002
gas        8972
include    8909
condition  8700
natural    8678
support    8539
regulation 8268
consist    8167
sustainable 8086
economic   8072
implementation 7909
efficiency 7882
service    7862
aim        7807
requirement 7789
renewable  7406
application 7261
country    7211
electricity 7184
ensure     7145
agreement  7117
organization 6941
rule       6866
standard   6865
implement  6717
government 6717
relate     6699
carry      6595
regulate   6522
develop    6519
framework  6369
action     6365
legal      6238
grant      6229
operation  6224
establishes 6187
plant      6125
authority  6050
program    5975
increase   5928
strategy   5905
level      5888
market     5833
rural      5755
function   5743
protect    5738
building   5695
person     5630
conservation 5606
process    5556
``         5525
create     5488
term       5463
international 5460
minister   5430
source     5393
improve    5308
right      5287
investment 5258
local      5256
fuel       5172
climate    5167
quality    5142
waste      5124
tax        5104
reduce     5096
technical  5029
issue      4997
In [5]:
# Ask the model for the 5 topics it ranks as most similar to the query
# "Transport", then display the (word, score) pairs of the first result.
# `similarity` holds the matching scores and is not used further in this cell.
# NOTE(review): the same two-line lookup is repeated in the following cells
# with other query strings; a small helper function would remove the copies.
similar_topics, similarity = topic_model.find_topics("Transport", top_n=5)
topic_model.get_topic(similar_topics[0])
Out[5]:
[('energy', 0.03230173130614682),
 ('adverse', 0.013766813290960673),
 ('modification', 0.013384015874985505),
 ('environment', 0.012575852856891081),
 ('hfc', 0.011520532554952024),
 ('assessment', 0.01110828408978698),
 ('ozone', 0.011048527597892805),
 ('transboundary', 0.010538213919751344),
 ('objective', 0.010021317175811909),
 ('tariff', 0.009750261281022469)]
In [6]:
# Ask the model for the 5 topics it ranks as most similar to the query
# "Industry", then display the (word, score) pairs of the first result.
# `similarity` holds the matching scores and is not used further in this cell.
similar_topics, similarity = topic_model.find_topics("Industry", top_n=5)
topic_model.get_topic(similar_topics[0])
Out[6]:
[('architectural', 0.0240691542320956),
 ('territory', 0.022780009435618136),
 ('county', 0.02132256745756662),
 ('municipal', 0.02049388887075923),
 ('neighbourhood', 0.018203884240013932),
 ('village', 0.016495837073308834),
 ('architecture', 0.016006323381752823),
 ('territorial', 0.013546296462238094),
 ('district', 0.013263537723284712),
 ('municipality', 0.012657715219287767)]
In [7]:
# Ask the model for the 5 topics it ranks as most similar to the query
# "Energy systems", then display the (word, score) pairs of the first result.
# `similarity` holds the matching scores and is not used further in this cell.
similar_topics, similarity = topic_model.find_topics("Energy systems", top_n=5)
topic_model.get_topic(similar_topics[0])
Out[7]:
[('biodiversity', 0.17937641584550487),
 ('biological', 0.050332011052528866),
 ('sectoral', 0.022775396263003653),
 ('strategic', 0.02205042804451),
 ('conserve', 0.019934529526093942),
 ('specie', 0.01947446686530582),
 ('management', 0.019097955609743363),
 ('genetic', 0.017997541413598488),
 ('equitable', 0.01706068145479469),
 ('objective', 0.01614543846081185)]
In [8]:
# Ask the model for the 5 topics it ranks as most similar to the query
# "Buildings", then display the (word, score) pairs of the first result.
# `similarity` holds the matching scores and is not used further in this cell.
similar_topics, similarity = topic_model.find_topics("Buildings", top_n=5)
topic_model.get_topic(similar_topics[0])
Out[8]:
[('architectural', 0.0240691542320956),
 ('territory', 0.022780009435618136),
 ('county', 0.02132256745756662),
 ('municipal', 0.02049388887075923),
 ('neighbourhood', 0.018203884240013932),
 ('village', 0.016495837073308834),
 ('architecture', 0.016006323381752823),
 ('territorial', 0.013546296462238094),
 ('district', 0.013263537723284712),
 ('municipality', 0.012657715219287767)]
In [9]:
# Ask the model for the 5 topics it ranks as most similar to the query
# "AFOLU", then display the (word, score) pairs of the first result.
# `similarity` holds the matching scores and is not used further in this cell.
similar_topics, similarity = topic_model.find_topics("AFOLU", top_n=5)
topic_model.get_topic(similar_topics[0])
Out[9]:
[('phytosanitary', 0.12690569974001198),
 ('cadmium', 0.041165131035834794),
 ('dalbergia', 0.041088957935167134),
 ('nickel', 0.03952115331890425),
 ('biocidal', 0.038387871497668556),
 ('polycyclic', 0.03649408337469938),
 ('eucalyptus', 0.03169599469361471),
 ('ambient', 0.027820439480866342),
 ('dariniensis', 0.022501541167033873),
 ('establishes', 0.01995691635840361)]
In [10]:
# Sanity check: number of documents in the corpus list.
len(docs)
Out[10]:
68339
In [11]:
import os

# Directory that receives the exported figures.
images_path = "/home/zhhuang/climate_policy_paper/paper_images"

# exist_ok=True makes the creation idempotent and removes the gap between the
# former existence check and the makedirs call. If the path exists but is not
# a directory this now raises FileExistsError instead of being skipped.
os.makedirs(images_path, exist_ok=True)
In [12]:
import plotly.io as pio
# Set "svg" as the default format on plotly's kaleido export scope.
pio.kaleido.scope.default_format = "svg"
# Disabled: override the MathJax location used by the kaleido scope.
# pio.kaleido.scope.mathjax = "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js"
In [13]:
# Bar charts of the 10 highest-scoring words for the first 40 topics, each
# panel drawn at 300x300.
fig = topic_model.visualize_barchart(
    top_n_topics=40,
    n_words=10,
    width=300,
    height=300,
)

# Save a static copy for the paper, then show the figure inline.
fig.write_image('/home/zhhuang/climate_policy_paper/paper_images/topic_country_expand_barchart.svg')
fig
In [14]:
# topic_model.visualize_barchart(top_n_topics = 20, n_words=10,width = 300, height= 300)
In [15]:
# Heatmap figure produced by the model's visualize_heatmap() helper.
fig2 = topic_model.visualize_heatmap()

# Save a static copy for the paper, then show the figure inline.
fig2.write_image('/home/zhhuang/climate_policy_paper/paper_images/topic_country_expand_heatmap.svg')
fig2
In [16]:
# Topic overview figure produced by the model's visualize_topics() helper.
fig3 = topic_model.visualize_topics()

# Save a static copy for the paper, then show the figure inline.
fig3.write_image('/home/zhhuang/climate_policy_paper/paper_images/topic_country_expand_visualize_topics.svg')
fig3
In [17]:
# Build the topic hierarchy (slow: the recorded run took about 20 minutes) and
# cache it as an Excel workbook so the figure cell can reload it later.
hierarchical_topics = topic_model.hierarchical_topics(docs)

# Write to the same absolute location the next cell reads from. The previous
# relative file name only matched that location when the notebook's working
# directory happened to be the `code/` folder.
hierarchical_topics_path = "/home/zhhuang/climate_policy_paper/code/Topic_country_expand_hierarchical_topics.xlsx"

# strings_to_urls=False stops xlsxwriter from converting URL-like text into
# hyperlinks.
with pd.ExcelWriter(hierarchical_topics_path, engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    hierarchical_topics.to_excel(writer)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [20:37<00:00, 21.34s/it]
In [18]:
import ast

# Reload the cached hierarchy. The workbook was written with the DataFrame
# index as its first column, so index_col=0 restores it instead of adding an
# extra "Unnamed: 0" data column.
hierarchical_topics = pd.read_excel(
    "/home/zhhuang/climate_policy_paper/code/Topic_country_expand_hierarchical_topics.xlsx",
    index_col=0,
)

# NOTE(review): BERTopic documents the "Topics" column as holding a list of
# topic ids per row; Excel can only store that as text such as "[1, 2, 3]".
# Parse such text back into lists so the reloaded frame matches the one
# returned by hierarchical_topics(). Confirm against the installed BERTopic
# version. Values that are not strings are left untouched.
if "Topics" in hierarchical_topics.columns:
    hierarchical_topics["Topics"] = hierarchical_topics["Topics"].map(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )

# Render the hierarchy, save a static copy for the paper, show it inline.
fig4 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
pio.write_image(fig4, '/home/zhhuang/climate_policy_paper/paper_images/topic_country_expand_hierarchical_topics.svg')
fig4
In [19]:
# Normalise the year labels to strings so they can be parsed with "%Y".
# A label of 0 is replaced by '2020', as in the original code (the reason for
# that choice is not visible in this notebook). Comparing on str(year) also
# catches an integer 0, which the former `i == '0'` test missed; building a new
# list instead of editing in place makes the cell give the same result on
# every run.
timestamp = ['2020' if str(year) == '0' else str(year) for year in timestamp]

# Slow step: the recorded run took about 2.5 hours.
topics_over_time = topic_model.topics_over_time(docs, timestamp, datetime_format="%Y", nr_bins=20)

# Write to the same absolute location the next cell reads from. The previous
# relative file name only matched that location when the notebook's working
# directory happened to be the `code/` folder.
topics_over_time_path = "/home/zhhuang/climate_policy_paper/code/Topic_country_expand_topics_over_time.xlsx"

# strings_to_urls=False stops xlsxwriter from converting URL-like text into
# hyperlinks.
with pd.ExcelWriter(topics_over_time_path, engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    topics_over_time.to_excel(writer)
19it [2:29:15, 471.35s/it]
In [20]:
# Reload the cached topics-over-time table from its workbook.
topics_over_time = pd.read_excel(
    "/home/zhhuang/climate_policy_paper/code/Topic_country_expand_topics_over_time.xlsx"
)

# Plot it, save a static copy for the paper, then show the figure inline.
fig5 = topic_model.visualize_topics_over_time(topics_over_time)
fig5.write_image('/home/zhhuang/climate_policy_paper/paper_images/topic_country_expand_visualize_topics_over_time.svg')
fig5
In [ ]: